################################################################################
##
## Purpose: This script prepares the information on the politicians.
##
## Author: James Bisbee (james.h.bisbee@vanderbilt.edu)
##
##  - Inputs:
##    - ./data/raw/politicians/1976-2020-house.csv: Raw data from https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/IG0UN2
##    - ./data/raw/politicians/1976-2020-senate.csv: Raw data from https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/PEJ5QU
##    - ./data/raw/politicians/new_legs_lookup_full.csv: Raw data from https://github.com/unitedstates/congress-legislators, manually prepared by Bisbee
##    - ./data/raw/politicians/legislators-historical.csv: Raw data from https://github.com/unitedstates/congress-legislators
##    - ./data/raw/politicians/HSall_members.csv: Raw data from VOTEVIEW (https://voteview.com/data)
##    - ./data/prepped/hearings/cleaned_docs.csv: Prepped data from 1_DATA_hearings_prep.R
##  - Outputs:
##    - ./data/prepped/politicians/politician_prepped_new.csv
##
##
## See associated log file for compute environment, package versions, 
##  and date of most recent run.
##
################################################################################
rm(list = ls())
gc()
require(tidyverse)
require(R.utils)
require(openxlsx)

set.seed(123)

# Compute details
print(paste0('Compute environment from ',Sys.Date(),' run by Bisbee'))
if(Sys.info()['sysname'] == 'Windows') {
  ram_size = system("wmic MemoryChip get Capacity", intern = TRUE)[-1]
  model_name = system("wmic cpu get name", intern = TRUE)[2] # nocov
  vendor_id = system("wmic cpu get manufacturer", intern = TRUE)[2] # nocov
  
  print(list(ram = stringr::str_squish(ram_size)[1],
             vendor_id = stringr::str_squish(vendor_id),
             model_name = stringr::str_squish(model_name),
             no_of_cores = parallel::detectCores()))
} else if(Sys.info()['sysname'] == 'Linuxs') {
  splitted <- strsplit(system("ps -C rsession -o %cpu,%mem,pid,cmd", intern = TRUE), " ")
  df <- do.call(rbind, lapply(splitted[-1], 
                              function(x) data.frame(
                                cpu = as.numeric(x[2]),
                                mem = as.numeric(x[4]),
                                pid = as.numeric(x[5]),
                                cmd = paste(x[-c(1:5)], collapse = " "))))
  df
} else {
  cat("If not on Linux or Windows, you'll have to figure out your own solution to seeing the compute environment.")
}

sessionInfo()


#   Want to JUST get some identifier attached to the vote data. How hard can this be?
house <- read_csv('./data/raw/politicians/1976-2020-house.csv')
senate <- read_csv('./data/raw/politicians/1976-2020-senate.csv')
idlookupFull <- read_csv('./data/raw/politicians/new_legs_lookup_full.csv',col_select = -1)
histIDs <- read_csv('./data/raw/politicians/legislators-historical.csv') # Why TF does the json not have the opensecrets ID?
hearings <- read_csv('./data/prepped/hearings/cleaned_docs.csv',col_select = -1)
ideo <- read_csv('./data/raw/politicians/HSall_members.csv')

# In fact, just need to get the details for 287 of these bastards
toMerge <- hearings %>%
  filter(!is.na(name)) %>%
  select(stab,distIDcurr,name,year,opensecretsID,party,chamber) %>% distinct() %>%
  mutate(lname = gsub('.*? ','',gsub(' (JR\\.|SR\\.|I(I+|V))$','',name)))

# How do we have two different distIDcurr for the same person in the same
#   year? Need to go back to the hearings data creation for this, dammitall.
toMerge %>%
  group_by(opensecretsID,year) %>%
  mutate(n=n()) %>%
  filter(n > 1)

for(i in 1:nrow(house)) {
  gsub('.*? ','',gsub('(,)* (JR\\.|SR\\.|I(I+|V))$','',utf8::utf8_encode(house$candidate[i])))
}

house <- house %>%
  filter(!is.na(candidate)) %>%
  group_by(year,candidate,totalvotes,state_po,office,district,stage,special,mode,unofficial,version) %>%
  mutate(candidatevotes = sum(candidatevotes)) %>%
  ungroup() %>%
  mutate(lname = gsub('.*? ','',gsub('(,)* (JR\\.|SR\\.|I(I+|V))$','',utf8::utf8_encode(candidate))),
         votepct = candidatevotes / totalvotes) %>%
  ungroup()

house <- house %>%
  group_by(year,state_po,office,district,stage,special,mode,unofficial,version) %>%
  mutate(votepct_rel = votepct - max(votepct[votepct != max(votepct)])) %>%
  ungroup()


senate <- senate %>%
  filter(!is.na(candidate)) %>%
  group_by(year,candidate,totalvotes,state_po,office,district,stage,special,mode,unofficial,version) %>%
  mutate(candidatevotes = sum(candidatevotes)) %>%
  mutate(lname = gsub('.*? ','',gsub('(,)* (JR\\.|SR\\.|I(I+|V))$','',utf8::utf8_encode(candidate))),
         votepct = candidatevotes / totalvotes) %>%
  ungroup()

senate <- senate %>%
  group_by(year,state_po,office,district,stage,special,mode,unofficial,version) %>%
  mutate(votepct_rel = votepct - max(votepct[votepct != max(votepct)])) %>%
  ungroup()

toMerge$votepct <- toMerge$votepct_rel <- toMerge$strdist <- toMerge$voteName <- NA
for(i in 1:nrow(toMerge)) {
  toMerge[i,] %>% data.frame()
  if(grepl("^FED|ADMIN|EXPERT",toMerge$opensecretsID[i])) { next }
  if(toMerge$chamber[i] == 'Senate') {
    ref <- senate %>% filter(state_po == toMerge$stab[i],
                             lname == toupper(toMerge$lname[i]),
                             year < (toMerge$year[i]))
    # ref %>% data.frame()
    if(nrow(ref) == 0) {
      ref <- senate %>% filter(state_po == toMerge$stab[i],
                               lname == toupper(toMerge$lname[i]))
    }
    tempDist <- abs(ref$year - toMerge$year[i])
    ref <- ref %>%
      slice(which(tempDist == min(tempDist)))
    
    dists <- stringdist::stringdist(toMerge$name[i],ref$candidate)
    
    ref <- ref %>%
      slice(which.min(dists)) %>%
      mutate(strdist = dists[which.min(dists)]) %>%
      select(candidate,candidatevotes,totalvotes,votepct,strdist,votepct_rel)
    if(nrow(ref) > 1) { stop() }
    if(ref$votepct_rel < 0) { stop() }
    toMerge$votepct[i] <- ref$votepct
    toMerge$strdist[i] <- ref$strdist
    toMerge$votepct_rel[i] <- ref$votepct_rel
    toMerge$voteName[i] <- ref$candidate
  }
  
  if(toMerge$chamber[i] == 'House') {
    if(!toMerge$stab[i] %in% state.abb) { next }
    
    ref <- house %>% filter(state_po == toMerge$stab[i],
                            lname == toupper(toMerge$lname[i]),
                            year < (toMerge$year[i]))
    ref %>% data.frame()
    if(nrow(ref) == 0) {
      ref <- house %>% filter(state_po == toMerge$stab[i],
                              lname == toupper(toMerge$lname[i]))
    }
    if(nrow(ref) == 0) {
      ref <- house %>% 
        mutate(stdist = paste0(state_po,sprintf('%02d',district))) %>%
        filter(stdist == toMerge$distIDcurr[i])
    }
    tempDist <- abs(ref$year - toMerge$year[i])
    ref <- ref %>%
      slice(which(tempDist == min(tempDist)))
    
    dists <- stringdist::stringdist(toMerge$name[i],ref$candidate)
    
    ref <- ref %>%
      slice(which.min(dists)) %>%
      mutate(strdist = dists[which.min(dists)]) %>%
      select(candidate,candidatevotes,totalvotes,votepct,strdist,votepct_rel)
    if(nrow(ref) > 1) { stop() }
    if(ref$votepct_rel < 0) { stop() }
    toMerge$votepct[i] <- ref$votepct
    toMerge$strdist[i] <- ref$strdist
    toMerge$votepct_rel[i] <- ref$votepct_rel
    toMerge$voteName[i] <- ref$candidate
  }
}

# 100% perfect! Screw Leip and moving to a new R script for cleanliness! (98 are fuzzy matches, but
#   all of them are correct based on manual inspection)
toMerge %>%
  select(distIDcurr,name,voteName,votepct,strdist) %>%
  arrange(-strdist) %>%
  data.frame()


# So now let's merge with the rest of the politician-specific data
# Only 14 speakers don't match

unmatched <- toMerge %>%
  filter(!grepl('FED|EXPERT|ADMIN',opensecretsID)) %>%
  left_join(idlookupFull %>%
              select(year,bioguide,opensecretsID = opensecrets,
                     first,last,birthday,gender,state,district,nickname,
                     yearStart)) %>%
  filter(is.na(birthday))



matched <- toMerge %>%
  filter(!grepl('FED|EXPERT|ADMIN',opensecretsID)) %>%
  left_join(idlookupFull %>%
              select(year,bioguide,opensecretsID = opensecrets,
                     first,last,birthday,gender,state,district,nickname,
                     yearStart)) %>%
  filter(!is.na(birthday))

# All of the unmatched are either from 2001 or 2002? wtf?
unmatched %>%
  select(year,name,distIDcurr,stab,opensecretsID)

# It seems to simply be because these don't have an opensecrets ID in the lookup data
idlookupFull %>%
  filter(year == 2001) %>%
  filter(state == 'TX') %>%
  select(bioguide,opensecrets,first,last) %>%
  data.frame()

# Manually assigning the correct opensecrets IDs to these guys
idlookupFull <- idlookupFull %>%
  mutate(opensecrets = ifelse(is.na(opensecrets) & bioguide=='L000556','N00001305',
                              ifelse(is.na(opensecrets) & bioguide == 'G000365','N00005709',
                                     ifelse(is.na(opensecrets) & bioguide == 'R000465','N00000740',
                                            ifelse(is.na(opensecrets) & bioguide == 'M000212','N00001339',
                                                   ifelse(is.na(opensecrets) & bioguide == 'R000258','N00003071',
                                                          ifelse(is.na(opensecrets) & bioguide == 'G000547','N00012774',
                                                                 ifelse(is.na(opensecrets) & bioguide == 'B000400','N00005805',opensecrets))))))))


# Make "yearStart" the first year they enter congress
idlookupMerge <- idlookupFull %>%
  select(year,bioguide,opensecretsID = opensecrets,
         first,last,birthday,gender,state,district,nickname,
         yearStart) %>%
  filter(opensecretsID %in% toMerge$opensecretsID) %>%
  group_by(opensecretsID,state,district) %>%
  mutate(yearStart = min(yearStart)) %>%
  distinct()

# And perfect match!
unmatched <- toMerge %>%
  filter(!grepl('FED|EXPERT|ADMIN',opensecretsID)) %>%
  left_join(idlookupMerge) %>%
  filter(is.na(birthday))


toMergeFull <- toMerge %>%
  left_join(idlookupMerge) %>%
  mutate(age = year - lubridate::year(as.Date(birthday)),
         seniority = year - yearStart)

# 86 of these merges have multiple observations, presumably due to 
#   the legislator finishing a stint in the House and starting one in the senate 
#   in the same year. Do we therefore want to measure seniority as total time in 
#   Congress writ large, or total time in their specific chamber? I think the
#   former better captures the concept of interest -- their experience serving
#   as legislator. (Although we should probably also want to get their seniority
#   as a member of a specific committee?)
toMergeFull %>%
  group_by(opensecretsID,year) %>%
  mutate(n = n()) %>%
  filter(n > 1) %>%
  data.frame()


idlookupMerge <- idlookupFull %>%
  select(year,bioguide,opensecretsID = opensecrets,
         first,last,birthday,gender,nickname,
         yearStart) %>%
  filter(opensecretsID %in% toMerge$opensecretsID) %>%
  group_by(opensecretsID) %>%
  mutate(yearStart = min(yearStart)) %>%
  distinct()


toMerge %>%
  group_by(opensecretsID,year) %>%
  mutate(n=n()) %>%
  filter(n > 1)

toMergeFull <- toMerge %>%
  left_join(idlookupMerge) %>%
  mutate(age = year - lubridate::year(as.Date(birthday)),
         seniority = year - yearStart)

toMergeFull %>%
  distinct() %>%
  group_by(opensecretsID,year) %>%
  mutate(n=n()) %>% filter(n > 1)

# Adding in ideology

# Great match EXCEPT for N00042619, the dude from Guam. His voting record is almost identical to 
#   Lisa Blunt Rochester, so I'll just give him her ideology: https://projects.propublica.org/represent/members/S001204-michael-san-nicolas/compare-votes/B001303-lisa-blunt-rochester/116
toMergeFull %>%
  left_join(ideo %>%
              select(icpsr,bioguide = bioguide_id,nominate_dim1,nominate_dim2) %>%
              distinct()) %>%
  select(name,opensecretsID,nominate_dim1) %>%
  filter(is.na(nominate_dim1)) %>% data.frame() %>%
  distinct()

# Weird...so there are two people whose nominate scores change between the 
#   112 and 114 congresses. I'm just going to select the nominate score
#   associated with fewer errors. 
ideo %>%
  filter(congress >= 106) %>%
  # select(icpsr,bioguide = bioguide_id,nominate_dim1,nominate_dim2) %>%
  distinct() %>%
  filter(!is.na(bioguide_id)) %>%
  group_by(bioguide_id) %>%
  summarise(nominate_dim1 = mean(nominate_dim1),
            nominate_dim2 = mean(nominate_dim2),.groups = 'drop') %>%
  filter(bioguide_id %in% c('D000613','G000570')) %>% data.frame()

toMergeFull <- toMergeFull %>%
  left_join(ideo %>%
              filter(congress >= 106) %>%
              select(bioguide = bioguide_id,nominate_dim1,nominate_dim2) %>%
              distinct() %>%
              filter(!is.na(bioguide)) %>%
              group_by(bioguide) %>%
              summarise(nominate_dim1 = mean(nominate_dim1),
                        nominate_dim2 = mean(nominate_dim2),.groups = 'drop')) %>%
  mutate(nominate_dim1 = ifelse(opensecretsID == 'N00042619',-.334,nominate_dim1))

# No duplicates
toMergeFull %>%
  distinct() %>%
  group_by(opensecretsID,year) %>%
  mutate(n=n()) %>% filter(n > 1) %>%
  data.frame()

# So we DO want to do something about missing data for gender, ideology, age, and seniority for the
#   Fed chairs and experts to ensure we don't drop them from all regressions.

# Age and seniority is easy enough for the Fed chairs, a bit trickier for experts. 
#   We assume that Fed chairs and experts are non-partisan and non-ideological. 
#   This assumption reflects the normative aspirations of their role at committee hearings, 
#   and not the actual views of the authors. (Many of the experts invited to the 2014-02-11 hearing
#   are clearly Republican affiliated and ideologically conservative.)
toMergeFull %>%
  filter(is.na(nominate_dim1)) %>%
  select(name,opensecretsID) %>%
  distinct()

hearings %>%
  filter(grepl('McCloskey',speaker)) %>% select(docID)

toMergeFull <- toMergeFull %>%
  mutate(age = ifelse(opensecretsID == 'FEDYELLEN',year - 1946,
                      ifelse(opensecretsID == 'FEDGREENSPAN',year - 1926,
                             ifelse(opensecretsID %in% c('FEDBERNANKE','FEDPOWELL'),year - 1953,
                                    ifelse(opensecretsID == 'EXPERTMATTINGLY',year - 1944, # https://prabook.com/web/j.virgil.mattingly/3630794
                                           ifelse(opensecretsID == 'EXPERTKOHN',year - 1942, # https://en.wikipedia.org/wiki/Donald_Kohn
                                                  ifelse(opensecretsID == 'EXPERTKOO',year - 1954, # https://en.wikipedia.org/wiki/Richard_Koo 
                                                         ifelse(opensecretsID == 'EXPERTMELTZER',year - 1928, # https://en.wikipedia.org/wiki/Allan_H._Meltzer
                                                                ifelse(opensecretsID == 'EXPERTMISHEL',year - 1953, # https://prabook.com/web/lawrence.mishel/3469231
                                                                       ifelse(opensecretsID == 'EXPERTTAYLOR',year - 1946, # https://en.wikipedia.org/wiki/John_B._Taylor
                                                                              ifelse(opensecretsID == 'EXPERTCALABRIA',year - 1969, # https://en.wikipedia.org/wiki/Mark_A._Calabria
                                                                                     ifelse(opensecretsID == 'EXPERTMCCLOSKEY',year - 1987, # https://static1.squarespace.com/static/5627d283e4b0458929042617/t/582b5edb5016e110994e6986/1479237340032/Abigail+McCloskey+CV.pdf (best guess based on BA tenure)
                                                                                     age))))))))))), 
         seniority = ifelse(opensecretsID == 'FEDYELLEN',year - 2014,
                            ifelse(opensecretsID == 'FEDGREENSPAN',year - 1987,
                                   ifelse(opensecretsID == 'FEDBERNANKE',year - 2006,
                                          ifelse(opensecretsID == 'FEDPOWELL',year - 2018,
                                                 ifelse(opensecretsID == 'EXPERTMATTINGLY',year - 1989, # https://prabook.com/web/j.virgil.mattingly/3630794
                                                        ifelse(opensecretsID == 'EXPERTKOHN',year - 2002, # https://en.wikipedia.org/wiki/Donald_Kohn
                                                               ifelse(opensecretsID == 'EXPERTKOO',year - 1997, # https://en.wikipedia.org/wiki/Richard_Koo
                                                                      ifelse(opensecretsID == 'EXPERTMELTZER',year - 1999, # https://en.wikipedia.org/wiki/Allan_H._Meltzer
                                                                             ifelse(opensecretsID == 'EXPERTMISHEL',year - 1987, # https://en.wikipedia.org/wiki/Lawrence_Mishel
                                                                                    ifelse(opensecretsID == 'EXPERTTAYLOR',year - 1993, # https://en.wikipedia.org/wiki/John_B._Taylor
                                                                                           ifelse(opensecretsID == 'EXPERTCALABRIA',year - 2017, # https://en.wikipedia.org/wiki/Mark_A._Calabria
                                                                                                  ifelse(opensecretsID == 'EXPERTMCCLOSKEY',year - 2013, # https://static1.squarespace.com/static/5627d283e4b0458929042617/t/582b5edb5016e110994e6986/1479237340032/Abigail+McCloskey+CV.pdf (best guess based publication record)
                                                                                                  seniority)))))))))))), 
         gender = ifelse(opensecretsID %in% c("FEDPOWELL","FEDGREENSPAN",'FEDBERNANKE'),'M',
                         ifelse(opensecretsID == 'FEDYELLEN','F',
                                ifelse(opensecretsID == 'EXPERTMCCLOSKEY','F',
                                       ifelse(grepl('EXPERT',opensecretsID),'M',gender)))),
         nominate_dim1 = ifelse(grepl('FED|EXPERT|ADMIN',opensecretsID),0,nominate_dim1),
         nominate_dim2 = ifelse(grepl('FED|EXPERT|ADMIN',opensecretsID),0,nominate_dim2))

# The only remaining speaker for whom we do not have age, seniority, or gender is the Clerk who we will drop.
toMergeFull %>%
  distinct() %>%
  group_by(opensecretsID,year) %>%
  mutate(n=n()) %>%
  filter(n > 1) %>% data.frame()


write.csv(toMergeFull,file = './data/prepped/politicians/politician_prepped_new.csv')

# EOF